
******************************************************************************
********************************data and variable creation*******************
******************************************************************************

* Load the raw S6 file (PII removed) and build the basic individual-level variables.
use "$raw/all_s6_noPII.dta", clear

* Harmonise the misspelt film titles to one spelling per film.
replace Newfilm = "MISS PEREGRINE" if inlist(Newfilm, "MISS PEREGRIE", "MISS PERE", "Miss Peregrine", "mISS PEREGRIE", "MISS PEREGRENE", "Miss peregrine")
replace Newfilm = "QUEEN OF KATWE" if Newfilm == "QWEEN OF KATWE"

* Female dummy: left missing when GENDER is blank.
generate female = 0 if GENDER != ""
replace female = 1 if inlist(GENDER, "FEMALE", "female")
tabulate female
label variable female "Female"

* Drop the observation(s) recorded with age 39.
drop if AGE == 39

* S4 dummy: 0 when S4S6 contains a "6", overwritten with 1 when it contains a "4".
generate S4 = 0 if regexm(S4S6, "6")
replace S4 = 1 if regexm(S4S6, "4")
label variable S4 "S4 class"

* One indicator per school (school1, school2, ...).
tabulate SCHOOL, generate(school)

* Treatment dummy: 1 for Queen of Katwe, 0 for any other non-blank film.
generate treatment = 0 if Newfilm != ""
replace treatment = 1 if inlist(Newfilm, "QUEEN OF KATWE", "QWEEN OF KATWE")
tabulate treatment
label variable treatment "Treatment"

* Screening date: 7 Oct 2016 by default, overridden school by school.
generate today = date("07/10/16", "DMY", 2050)
replace today = date("08/10/16", "DMY", 2050) if inlist(SCHOOL, "Makerere secondary school", "Paul mukasa secondary school")
replace today = date("09/10/16", "DMY", 2050) if inlist(SCHOOL, "ROYAL COLLEGE", "KULUMBA HS")
replace today = date("10/10/16", "DMY", 2050) if inlist(SCHOOL, "hope boarding secondary school lutembe", "Mukono Parents high school", "KYANDONDO")
replace today = date("11/10/16", "DMY", 2050) if inlist(SCHOOL, "DYNAMIC", "JAKAYZ S.S.S", "ATLAS", "GAYAZA MIXED ISLAMIC S.S.S")
format today %td
tabulate SCHOOL today
label variable today "Screening date"

* Screening session: 1 by default, 2-8 assigned by school.
generate session = 1
replace session = 2 if SCHOOL == "Makerere secondary school"
replace session = 3 if SCHOOL == "Paul mukasa secondary school"
replace session = 4 if inlist(SCHOOL, "ROYAL COLLEGE", "KULUMBA HS")
replace session = 5 if inlist(SCHOOL, "hope boarding secondary school lutembe", "Mukono Parents high school")
replace session = 6 if SCHOOL == "KYANDONDO"
replace session = 7 if SCHOOL == "DYNAMIC"
replace session = 8 if inlist(SCHOOL, "JAKAYZ S.S.S", "ATLAS", "GAYAZA MIXED ISLAMIC S.S.S")
tabulate SCHOOL session
label variable session "Screening session"

* Flag sessions 1, 3, 6 and 7.
generate Q_large = inlist(session, 1, 3, 6, 7)
label variable Q_large "Saw QofK in large screen"

* Fill KISW from the alternate column KISWA where KISW is blank, then drop KISWA.
replace KISW=KISWA if KISW=="" & KISWA!=""
*replace IRE=ISLAM if IRE=="" & ISLAM!=""
drop KISWA

* Principal papers: convert letter grades to points (A=6 ... E=2, O=1, F=0)
* and flag a pass for any score above 1.
local principal HIST ECON GEOG ENT IRE AGRIC LUG CRE PHY MTC BIO CHEM ART LIT KISW ARB TD
foreach x of local principal {
    * Strip leading/trailing blanks first. The original only caught the exact
    * value "E "; any other padded grade (e.g. "A " or " B") fell through to
    * destring, force and was silently turned into missing.
    replace `x'=trim(`x')
    replace `x'="6" if `x'=="A"
    replace `x'="5" if `x'=="B"
    replace `x'="4" if `x'=="C"
    replace `x'="3" if `x'=="D"
    replace `x'="2" if `x'=="E"
    replace `x'="1" if `x'=="O"
    replace `x'="0" if `x'=="F"
    * force: anything that is still non-numeric becomes missing.
    destring `x', replace force
    gen `x'_pass=0
    replace `x'_pass=1 if `x'>1 & `x'!=.
    * (a stray `' token after the label string was removed here)
    la var `x'_pass "passed `x'"
}


* Subsidiary papers: reverse the 1-9 grade so that higher is better (1 -> 8, ..., 9 -> 0).
local subsidiary SMA CST GEP
foreach paper of local subsidiary {
    * Step 1: map exactly the values 1..9 to 18..10 (k -> 19-k).
    replace `paper' = 19 - `paper' if inlist(`paper', 1, 2, 3, 4, 5, 6, 7, 8, 9)
    * Step 2: shift every value down by 10, including any value not in 1..9.
    replace `paper' = `paper' - 10
}


* Subsidiary points: one point for each of SMA, CST, GEP with a score of at least 3.
generate sub_point = (SMA>=3 & SMA!=.) + (CST>=3 & CST!=.) + (GEP>=3 & GEP!=.)
label variable sub_point "0-2 if passed subsidiary exams"

* Flag students with a score in both SMA and CST.
generate two_sub = (SMA!=. & CST!=.)

* Raw subsidiary totals (rowtotal treats a missing paper as 0).
egen subsidiary = rowtotal(SMA CST)
label variable subsidiary "SMA or CST score"
generate subsidiary_gep = subsidiary + GEP
label variable subsidiary_gep "subsidary and general paper score"

* Raw principal total over all principal papers.
egen principal = rowtotal(`principal')
label variable principal "Principal paper score"

* Number of principal passes, and the two-pass indicator.
egen passes = rowtotal(*_pass)
label variable passes "Number of passes in principal papers"
generate uni_pass = (passes>=2)
label variable uni_pass "Achieved 2 principal passes"

* Standardise each principal paper against the control group (treatment==0).
foreach subj of local principal {
    * Control-group moments exist only on control rows ...
    egen mean_`subj' = mean(`subj') if treatment==0
    egen sd_`subj' = sd(`subj') if treatment==0
    * ... so spread them to every row via max().
    egen mean_`subj'2 = max(mean_`subj')
    egen sd_`subj'2 = max(sd_`subj')
    drop mean_`subj' sd_`subj'
    label variable mean_`subj'2 "Control mean of `subj'"
    label variable sd_`subj'2 "Control sd of `subj' "
    generate std_`subj' = (`subj' - mean_`subj'2) / sd_`subj'2
    label variable std_`subj' "Standardised score in `subj'"
}

* Principal composite: sum of the standardised papers (missing counts as 0), divided by 3.
egen std_principal = rowtotal(std_*)
replace std_principal = std_principal/3
label variable std_principal "Principal score"

* Standardise the subsidiary point count against the control group (treatment==0).
egen mean_sub_point=mean(sub_point) if treatment==0
egen sd_sub_point=sd(sub_point) if treatment==0
egen mean_sub_point2=max(mean_sub_point)
la var mean_sub_point2 "Control mean of sub point"
drop mean_sub_point
egen sd_sub_point2=max(sd_sub_point)
la var sd_sub_point2 "Control sd of sub point"
drop sd_sub_point
gen std_sub_point = (sub_point - mean_sub_point2) / sd_sub_point2
la var std_sub_point "Standarised subsidiary point (0-2)"

* Overall composite: standardised principal papers plus the standardised
* subsidiary point, divided by 5.
* The original used rowtotal(std_*), which also matched the aggregate
* std_principal created just above and so counted the principal papers twice
* (the source flagged this line as "incorrect???????"). The aggregate is now
* removed from the wildcard expansion before summing.
unab std_all : std_*
local std_aggregate std_principal
local std_components : list std_all - std_aggregate
egen std_POINTS=rowtotal(`std_components')
replace std_POINTS=std_POINTS/5
la var std_POINTS "Overall score"


* Standardise each subsidiary paper against the control group (treatment==0).
foreach paper of local subsidiary {
    * Control-group moments exist only on control rows ...
    egen mean_`paper' = mean(`paper') if treatment==0
    egen sd_`paper' = sd(`paper') if treatment==0
    * ... so spread them to every row via max().
    egen mean_`paper'2 = max(mean_`paper')
    egen sd_`paper'2 = max(sd_`paper')
    drop mean_`paper' sd_`paper'
    label variable mean_`paper'2 "Control mean of `paper'"
    label variable sd_`paper'2 "Control sd of `paper'"
    generate std_`paper' = (`paper' - mean_`paper'2) / sd_`paper'2
    label variable std_`paper' "Standardised score in`paper'"
}

* Subsidiary composite: sum of the three standardised papers (missing counts as 0), divided by 2.
egen std_subsidiary = rowtotal(std_SMA std_CST std_GEP)
replace std_subsidiary = std_subsidiary/2
label variable std_subsidiary "Subsidiary score"

* added 26/6/19: overwrite POINTS with principal total plus subsidiary points.
replace POINTS = principal + sub_point

* Renormalise each aggregate against the control group (treatment==0).
* The result is stored in <name>2 and inherits the label of <name>.
foreach agg of varlist std_principal std_subsidiary subsidiary POINTS std_POINTS {
    egen mean_`agg' = mean(`agg') if treatment==0
    egen sd_`agg' = sd(`agg') if treatment==0
    * Spread the control-only moments to every row.
    egen mean_`agg'2 = max(mean_`agg')
    egen sd_`agg'2 = max(sd_`agg')
    drop mean_`agg' sd_`agg'
    label variable mean_`agg'2 "Control mean of `agg'"
    label variable sd_`agg'2 "Control sd of `agg'"
    generate `agg'2 = (`agg' - mean_`agg'2) / sd_`agg'2
    local lbl : variable label `agg'
    * (disabled in the original) prefix the label with "Normalised " and lower-case it
    label variable `agg'2 "`lbl'"
}

summarize POINTS2

* Age variables: one dummy per age value, age squared, and the sample median.
la var AGE "Age"
tab AGE, gen(age_)
gen AGE_sq=AGE^2
la var AGE_sq "Age squared"
egen median_age=median(AGE)
tab median_age
la var median_age "Median age"

* Above-median-age dummy. Stata treats missing as larger than any number, so
* the original "AGE>median_age" coded students with missing AGE as older than
* the median. The dummy is now left missing when AGE is missing, in line with
* how female and treatment are left missing when their source is blank.
gen above_median_age=(AGE>median_age) if !missing(AGE)
la var above_median_age "Dummy if older than median age"

* STEM: has a score in any of MTC, PHY, CHEM or BIO.
gen stem=0
replace stem=1 if MTC!=. | PHY!=. | CHEM!=. | BIO!=. 
la var stem "STEM"
tab stem female

* Subject-taken dummies and their interactions with the school indicators.
local subjects TD ARB KISW ENT LIT ART CHEM BIO MTC PHY CRE LUG AGRIC IRE GEOG ECON HIST CST SMA
foreach subj of local subjects {
    * 1 when the student has a score recorded for the subject, else 0.
    generate `subj'_dummy = (`subj'!=.)
    label variable `subj'_dummy "takes `subj' as a principal paper"
    tabulate `subj'_dummy
    * NOTE(review): interactions cover school1-school11 only, although school12
    * is referenced further down in this file - confirm the omission is deliberate.
    foreach sch of varlist school1-school11 {
        generate `sch'_`subj' = `sch' * `subj'_dummy
    }
}

tabulate MTC_dummy female



* School-level characteristics, coded from the school indicator dummies.
* inlist(1, a, b, ...) is 1 when any of the listed dummies equals 1.
generate christian = inlist(1, school11, school1, school10, school8, school9, school12)
label variable christian "Attends Christian school"

generate boarding_only = inlist(1, school11, school12, school1, school7)
label variable boarding_only "School only has boarding"

generate top_200 = inlist(1, school12, school10, school6, school9, school11, school7)
label variable top_200 "School ranked in top 200"

generate high_fees = inlist(1, school12, school11, school10, school1)
label variable high_fees "School has above median fees"

global school_controls christian boarding_only top_200  high_fees

* Number of observations per school, and a flag for schools with 65 or fewer.
bysort SCHOOL: generate size_school = _N
generate small_school = (size_school<=65)

* Exam dates (November 2016), grouped by day of the month.
* Percentages carried over from the original per-line annotations
* (presumably the share of students sitting each paper - confirm):
*   MTC 23%, HIST 57%, ECON 42%, GEOG 19%, IRE 13%, BIO 14%, CRE 23%, PHY 12%,
*   LUG 16%, KISW 2.5%, ENT 20%, LIT 4%, ART 28%, AGRIC 6%, CHEM 15%
local nov14 MTC HIST
local nov15 ECON
local nov16 GEOG
local nov18 IRE BIO CRE
local nov21 PHY LUG KISW
local nov23 ENT
local nov24 LIT ART
local nov25 AGRIC
local nov29 CHEM
foreach dd in 14 15 16 18 21 23 24 25 29 {
    foreach subj of local nov`dd' {
        generate `subj'_date = date("`dd'/11/16", "DMY", 2050)
    }
}

format today %td

* Label every exam date and compute its distance from the screening date.
* The varlist is expanded once, before the loop body creates the *_days variables.
foreach dvar of varlist *_date {
    label variable `dvar' "Date exam taken"
    generate `dvar'_days = `dvar' - today
    label variable `dvar'_days "Days between screening and exam date"
}

* Blank the distance for papers the student has no score in.
* capture: some principal papers (e.g. ARB, TD) have no date variable.
foreach subj of local principal {
    capture replace `subj'_date_days = . if `subj'==.
}

* Earliest and mean exam distance per student.
egen first_exam = rowmin(*_days)
label variable first_exam "Earliest exam in days since screening"
*34-44 - median 36
egen mean_exam = rowmean(*_days)
label variable mean_exam "Mean days between screening and exams"
*35-48 - median 40 days

* Re-centre both on the hard-coded offsets and interact with treatment.
replace first_exam = first_exam - 34
generate treatment_first_exam = treatment * first_exam
tabulate mean_exam
replace mean_exam = mean_exam - 35
generate treatment_mean_exam = treatment * mean_exam



* Early-exam indicator based on the first exam: 1 when the (re-centred)
* first-exam distance is strictly below the sample median.
egen med_first_exam = median(first_exam) // 2 days - 34 days after screening
generate early_exam = (first_exam < med_first_exam)
generate treatment_early_exam = treatment * early_exam
label variable early_exam "First exam earlier than median first exam"
label variable treatment_early_exam "Treated and first exam earlier than median first exam (36)"

* Same construction using the mean distance across all of the student's exams.
egen med_mean_exam = median(mean_exam)
generate early_exam2 = (mean_exam < med_mean_exam)
label variable early_exam2 "Most exams below the median mean days between screening and exam"
generate treatment_early_exam2 = treatment * early_exam2
label variable treatment_early_exam2 "Treated and most exams below the median mean days between (40)"

* The medians were only needed as scratch variables.
drop med_first_exam med_mean_exam

* Share of each school's students who saw Queen of Katwe.
egen num_students = count(SCHOOL), by(SCHOOL)
label variable num_students "Number of students at that school"

egen num_Q = total(treatment), by(SCHOOL)
label variable num_Q "Number of students at that school seeing QofK"

generate Q_perc = num_Q / num_students
label variable Q_perc "Percentage of students at that school seeing QofK"

* 1 when the share is at least one half.
generate Q_50 = (Q_perc >= 0.50)
label variable Q_50 "More than 50% students saw QofK at that school"


* Day dummies (day1-day5) - these just counteract the school dummies -
* plus a single 1-5 index of the screening day built from them.
tabulate today, generate(day)
generate day = 1 if day1 == 1
forvalues d = 2/5 {
    replace day = `d' if day`d' == 1
}
label variable day "Day of screening"

* ---- MOCK ----
* (the original banner started with "///", which Stata reads as a
*  line-continuation token rather than a plain comment)
* Merge in the cleaned mock-exam results and keep matched students only.
merge 1:1 id using  "$cleaned/all_s6_mock_cleaned.dta" 
keep if _merge==3
drop _merge

sum POINTS2_mock  std_principal2_mock std_subsidiary2_mock
lab var POINTS2_mock "Mock overall score"
lab var std_principal2_mock "Mock principal score"
la var std_subsidiary2_mock "Mock subsidiary score"

* Below-median mock indicators, built only when global mock is set to 1.
* real("${mock}") evaluates to missing when the global is undefined, so the
* block is skipped; the original "if ${mock}==1" expanded to "if ==1" in that
* case and stopped the do-file with a syntax error.
if real("${mock}")==1 {
    * _pctile with no options returns the median in r(r1).
    _pctile POINTS2_mock
    gen below_median_mock=0
    replace below_median_mock=1 if POINTS2_mock<r(r1)
    label variable below_median_mock "Below median mock"

    * Subsidiary mock score: SMA_mock, falling back to CST_mock when SMA_mock is missing.
    gen subsidiary_mock=SMA_mock 
    replace subsidiary_mock=  CST_mock if subsidiary_mock==.
    _pctile subsidiary_mock
    display r(r1)
    gen below_median_mock_sub=0 if subsidiary_mock!=.
    replace below_median_mock_sub=1 if subsidiary_mock<=r(r1)
    label variable below_median_mock_sub "Below median mock"
}

* University outcome. keep(master match) stops ids that exist only in the
* using file from being appended as new, otherwise empty rows. Without it,
* students removed earlier in this file (drop if AGE==39, keep if _merge==3
* on the mock merge) could be re-added by this merge.
merge 1:1 id using  "$raw/S6_uni_noPII", keep(master match) nogenerate
drop PROGRAM UNIVERSITY
* Students with no record in the university file are coded 0.
replace UNI=0 if UNI==.

* University application. Same protection against using-only rows.
merge 1:1 id using "$raw/uni_applied_noPII", keep(master match) nogenerate
* Left missing when universityapplied is blank.
gen uni_applied=0 if universityapplied!=""
replace uni_applied=1 if universityapplied=="yes"
drop universityapplied

* Female x STEM interaction.
generate female_stem = female * stem

* Treatment interactions for every variable named in global subgroup.
* capture: generate fails harmlessly when the interaction already exists.
foreach grp of global subgroup {
    capture generate treatment_`grp' = treatment * `grp'
    local lbl : variable label `grp'
    label variable treatment_`grp' "Treatment * `lbl'"
}

* capture: below_median_mock_sub exists only when the mock block ran.
capture generate treatment_below_median_mock_sub = treatment * below_median_mock_sub

* Same for global subgroup2, also labelling the _sub variant when it exists.
foreach grp of global subgroup2 {
    capture generate treatment_`grp' = treatment * `grp'
    local lbl : variable label `grp'
    label variable treatment_`grp' "Treatment * `lbl'"
    capture label variable treatment_`grp'_sub "Treatment * `lbl'"
}


* Write the cleaned analysis file.
save "$cleaned/all_s6_cleaned.dta", replace
